"""
This code processes the raw data files from the MSO for research. Code was written using python 3. "MSO" has been
used in the code to protect the name of the operator.

by: Jacob Malone (j.malone@cablelabs.com)
date: August 25, 2021
"""


import datetime as dt
from glob import glob
from time import ctime

import pandas as pd

# Define directories where data will be loaded and output will be saved
out_dir = '/mnt/et01/cord_cutting_rep'
csv_dir = '/mnt/et01/mso/new_flow_2015/csv'
market_files = glob(f'{csv_dir}/merge1-market-*.csv')

# This file provides the correct list of "customer_key"s needed to filter the data, so the samples
# match between the 2 time periods
demo_frame = pd.read_csv('demo.csv')
demo_keys = demo_frame.customer_key.unique()


def convert_stata_dates(num_days):
    """A function that allows me to convert the Stata date output to a datetime object in python"""
    return dt.date(1960, 1, 1) + dt.timedelta(days=num_days)


# This function loops through each out the "market_files" we need to process
for lf in market_files:
    print(f'Processing {lf}.... {ctime()}')

    # Load raw CSV file
    frame = pd.read_csv(lf)

    # Filter the data down to the set of "customer_key"s of interest
    frame = frame.loc[frame.customer_key.isin(demo_keys)].copy()
    if frame.empty:
        continue

    # Calculates total usage from the downstream and upstream usage columns. Variable is in gigabytes
    frame['tot_gb'] = frame['tot_gbdn'] + frame['tot_gbup']

    # Total usage is broken out into proportions based on DPI data. This section of code ensures that 
    # we do not have any missing data in these columns and that they sum to 1.
    print(f'Missing values in proportion variables {frame["prop_gaming"].isnull().sum()}....')
    prop_cols = [col for col in frame.columns if col.startswith('prop_')]
    for col in prop_cols:
        frame[col] = frame[col].fillna(0)
    print(f'Missing values in proportion variables {frame["prop_gaming"].isnull().sum()}....')

    frame['prop_sum_check'] = frame[prop_cols].sum(axis=1)
    print(frame['prop_sum_check'].describe())
    frame = frame.drop(['prop_sum_check'], axis=1)

    # Calculate levels of usage. The overall usage value comes from IPDR data and the proportions come from DPI.
    # These two columns are multiplied together to calculate total usage by category.
    print(f'Calculating levels.... {ctime()}')
    traffic_types = ['admin',
                     'backup',
                     'cdn',
                     'gaming',
                     'music',
                     'sharing',
                     'streaming',
                     'tunnel',
                     'other',
                     'vidamazon',
                     'vidflash',
                     'vidhbo',
                     'vidhulu',
                     'vidnetflix',
                     'vidyoutube',
                     'viddish',
                     'vidslingtv',
                     'vidother',
                     'browsinstagram',
                     'browsamazon',
                     'browsapple',
                     'browscnn',
                     'browsespn',
                     'browsgoogle',
                     'browshttp',
                     'browshttpdl',
                     'browsfbook',
                     'browsadult',
                     'browsother']
    for level in traffic_types:
        frame['level_' + level] = frame['prop_' + level] * frame['tot_gb']
        frame = frame.drop(['prop_' + level], axis=1)

    # This section of code ensures that the category usage variables calculated about match the total usage
    # variable from IPDR.
    level_cols = [col for col in frame.columns if col.startswith('level_')]
    frame['level_sum_check'] = frame[level_cols].sum(axis=1)
    frame['level_diff_check'] = frame['tot_gb'] - frame['level_sum_check']
    print(frame['level_diff_check'].describe())
    frame = frame.drop(['level_sum_check', 'level_diff_check'], axis=1)

    # Calculate a usage column for OTT video.
    video_cols = [col for col in frame.columns if col.startswith('level_vid')]
    frame['level_video'] = frame[video_cols].sum(axis=1)

    # Calculate a usage column for Browsing usage.
    browsing_cols = [col for col in frame.columns if col.startswith('level_brows')]
    frame['level_browsing'] = frame[browsing_cols].sum(axis=1)

    # Create a list of the category usage variables that instead of using the invididual application breakouts (e.g., Netflix,
    # YouTube) for OTT video and Browsing uses the aggregated variables created above.
    level_cols = [col for col in frame.columns if
                  col.startswith('level_') and not col.startswith('level_vid') and not col.startswith('level_brows')]
    level_cols.append('level_video')
    level_cols.append('level_browsing')

    # Now that we've validated the usage columns, do a final calculation to ensure the total usage and category variables
    # match perfectly.
    frame['tot_gb'] = frame[level_cols].sum(axis=1)

    # Reduce data to the "customer_key-date-hour" level of observation
    print('Collpasing data.... {0}'.format(ctime()))
    reduce_sum_columns = ['tot_gbdn',
                          'tot_gbup',
                          'tot_pkts_passup',
                          'tot_pkts_dropup',
                          'tot_pkts_delayup',
                          'tot_pkts_passdn',
                          'tot_pkts_dropdn',
                          'tot_pkts_delaydn',
                          'level_admin',
                          'level_backup',
                          'level_cdn',
                          'level_gaming',
                          'level_music',
                          'level_sharing',
                          'level_streaming',
                          'level_tunnel',
                          'level_other',
                          'level_vidamazon',
                          'level_vidflash',
                          'level_vidhbo',
                          'level_vidhulu',
                          'level_vidnetflix',
                          'level_vidyoutube',
                          'level_viddish',
                          'level_vidslingtv',
                          'level_vidother',
                          'level_video',
                          'tot_gb',
                          'level_browsinstagram',
                          'level_browsamazon',
                          'level_browsapple',
                          'level_browscnn',
                          'level_browsespn',
                          'level_browsgoogle',
                          'level_browshttp',
                          'level_browshttpdl',
                          'level_browsfbook',
                          'level_browsadult',
                          'level_browsother',
                          'level_browsing']
    reduce_first_columns = ['merge1']

    collapse_dict = {}
    for col in reduce_sum_columns:
        collapse_dict[col] = 'sum'
    for col in reduce_first_columns:
        collapse_dict[col] = 'first'

    frame = frame.groupby(['customer_key', 'date', 'hr']).agg(collapse_dict).reset_index()

    # Convert the date variables into native datetime objects
    print('Cleaning dates.... {0}'.format(ctime()))
    frame['date'] = frame['date'].apply(convert_stata_dates)
    frame['date'] = pd.to_datetime(frame['date'], infer_datetime_format=True)

    # Sort the data
    frame = frame.sort_values(by=['customer_key', 'date', 'hr'])

    # Save output file
    print('Saving file.... {0}'.format(ctime()))
    out_file = f'{out_dir}/{lf.split("/")[-1].split(".")[0]}_processed_May2020.parquet'
    frame.to_parquet(out_file)

    # Set the data object to None to assist with garbage collection in python
    frame = None
